library(dtwclust)
library(dtw)
df<-read.csv("../clean_data/gap_list_year.csv")
df
df <- subset(df, select = -c(X) )
df
colnames(df)[colnames(df) == "X2002"] = "2002"
colnames(df)[colnames(df) == "X2003"] = "2003"
colnames(df)[colnames(df) == "X2005"] = "2005"
colnames(df)[colnames(df) == "X2007"] = "2007"
colnames(df)[colnames(df) == "X2009"] = "2009"
colnames(df)[colnames(df) == "X2011"] = "2011"
colnames(df)[colnames(df) == "X2013"] = "2013"
colnames(df)[colnames(df) == "X2015"] = "2015"
colnames(df)[colnames(df) == "X2017"] = "2017"
colnames(df)[colnames(df) == "X2019"] = "2019"
colnames(df)[colnames(df) == "X2022"] = "2022"
df
jurisdiction = df[['Jurisdiction']]
dtw_df <- df[, -1]
dtw_df
df_lst <- tslist(dtw_df)
remove_nan <- function(ts) {
ts[!is.na(ts)]
}
# Apply the function to each time series in the list
df_lst <- lapply(df_lst, remove_nan)
df_lst
$`1`
[1] -8 -7 -6 -6 -9 -5 -8 -8 -7 -10 -10
$`2`
[1] -7 -13 -8 -9 -9 -8 -6 -6 -8 -7 -13
$`3`
[1] -11 -6 -8 -8 -6 -7 -6 -9 -5 -5 -5
$`4`
[1] -6 -9 -8 -8 -11 -5 -8 -10 -9 -5 -9
$`5`
[1] -4 -7 -7 -9 -6 -5 -6 -6 -9 -9 -7
$`6`
[1] -7 -7 -6 -5 -7 -9 -8 -7 -6 -8 -4
$`7`
[1] -7 -8 -8 -7 -9 -8 -5 -8 -5 -7 0
$`8`
[1] -4 -4 -6 -6 -5 -5 -4 -5 -9 -7 -8
$`9`
[1] -8 -8 -5 -7 -6 -7 -7 -7 -5 -9 -7
$`10`
[1] -8 -8 -9 -6 -7 -10 -6 -6 -7 -8 -10
$`11`
[1] -10 -13 -9 -11 -12 -11 -12 -9 -9 -9 -12
$`12`
[1] -8 -5 -7 -5 -9 -6 -5 -7 -5 -11 -8
$`13`
[1] -7 -5 -3 -5 -9 -5 -7 -6 -7 -6 -2
$`14`
[1] -4 -8 -8 -5 -9 -8 -7 -6 -5 -8 -3
$`15`
[1] -6 -7 -6 -6 -9 -6 -6 -5 -6 -5 -4
$`16`
[1] -8 -8 -5 -7 -4 -9 -7 -8 -5 -8 -7
$`17`
[1] -9 -8 -4 -7 -7 -4 -7 -4 -4 -6 -3
$`18`
[1] -6 -10 -3 -9 -9 -9 -8 -11 -3 -10 -9
$`19`
[1] -6 -5 -7 -5 -8 -6 -7 -8 -5 -7 -6
$`20`
[1] -6 -7 -6 -7 -6 -7 -8 -10 -9 -5 -5
$`21`
[1] -6 -6 -3 -5 -5 -5 -6 -5 -6 -10 -6
$`22`
[1] -6 -6 -5 -8 -8 -6 -7 -5 -8 -8 -6
$`23`
[1] -9 -13 -8 -4 -7 -6 -8 -11 -5 -5 -7
$`24`
[1] -6 -7 -8 -8 -5 -11 -3 -5 -7 -7 -11
$`25`
[1] -8 -7 -6 -9 -10 -10 -7 -7 -4 -8 -5
$`26`
[1] -10 -10 -5 -3 -6 -7 -6 -8 -5 -8 -9
$`27`
[1] -7 -7 -6 -6 -6 -7 -6 -7 -6 -7 -7
$`28`
[1] -7 -5 -5 -4 -5 -6 -4 -9 -7 -9 -6
$`29`
[1] -6 -9 -9 -6 -6 -6 -8 -3 -8 -7 -3
$`30`
[1] -7 -8 -7 -6 -7 -9 -6 -11 -2 -9 -7
$`31`
[1] -7 -7 -5 -6 -5 -4 -5 -7 -8 -7 -12
$`32`
[1] -7 -5 -8 -3 -10 -6 -8 -11 -8 -8 -9
$`33`
[1] -10 -8 -5 -7 -7 -7 -6 -7 -6 -5 -1
$`34`
[1] -7 -11 -8 -8 -9 -8 -7 -8 -6 -11 -6
$`35`
[1] -6 -7 -5 -5 -6 -5 -8 -2 -10 -8 -7
$`36`
[1] -5 -8 -7 -5 -5 -4 -6 -6 -4 -4 -6
$`37`
[1] -7 -7 -6 -6 -6 -6 -8 -6 -7 -4 -11
$`38`
[1] -9 -10 -7 -6 -9 -9 -9 -7 -11 -5 -7
$`39`
[1] -5 -7 -8 -7 -5 -9 -10 -8 -2 -9 -4
$`40`
[1] -5 -7 -9 -8 -10 -9 -10 -5 -5 -5 -11
$`41`
[1] -9 -8 -7 -8 -6 -10 -9 -10 -7 -8 -7
$`42`
[1] -7 -5 -8 -7 -5 -6 -8 -5 -5 -4 -11
$`43`
[1] -6 -9 -8 -6 -6 -8 -7 -10 -8 -4 -6
$`44`
[1] -4 -6 -6 -6 -6 -4 -4 -6 -3 -8 -4
$`45`
[1] -7 -9 -10 -8 -5 -5 -8 -8 -9 -8 -4
$`46`
[1] -8 -5 -7 -7 -5 -6 -10 -8 -7 -8 -4
$`47`
[1] -4 -9 -5 -6 -7 -7 -7 -5 -3 -6 -13
$`48`
[1] -7 -10 -9 -6 -9 -10 -6 -14 -6 -5 -5
$`49`
[1] -4 -8 -7 -9 -7 -9 -8 -8 -9 -10 -3
$`50`
[1] -7 -8 -5 -2 -7 -5 -9 -6 -6 -8 -9
$`51`
[1] -5 -6 -5 -6 -7 -7 -6 -5 -6 -2 -3
df_cvi <- list()
for (i in 2:10){
df_clust <- tsclust(df_lst, type = "partitional", k = i, distance = "dtw_basic", centroid = "pam")
df_metric <- cvi(df_clust, type = "valid", log.base = 10)
df_cvi <- append(df_cvi, list(df_metric))
}
df_cvi_ma <- do.call(rbind, df_cvi)
rw <- c("K2","K3","K4","K5","K6","K7","K8","K9","K10")
rownames(df_cvi_ma) <- rw
print(df_cvi_ma)
Sil SF CH DB DBstar D COP
K2 0.066258969 6.041834e-13 16.025157 2.360298 2.360298 0.08571429 0.5281549
K3 0.060935061 0.000000e+00 10.834646 2.627336 2.627336 0.09230769 0.4616402
K4 0.100837680 0.000000e+00 14.996435 1.509907 1.527303 0.24324324 0.3915600
K5 0.024228664 0.000000e+00 8.583573 1.472370 1.548017 0.17647059 0.3941870
K6 0.046777137 0.000000e+00 10.590517 1.633556 1.774753 0.12244898 0.3930014
K7 0.031945463 0.000000e+00 7.706790 1.657438 1.725652 0.13636364 0.3522119
K8 0.026268305 0.000000e+00 7.428571 1.942841 2.019564 0.13333333 0.3536732
K9 0.005328538 0.000000e+00 6.888000 1.442041 1.535972 0.22580645 0.3319078
K10 0.017396145 0.000000e+00 4.897611 1.358186 1.752563 0.10714286 0.3170716
– “Sil” (!): Silhouette index (Rousseeuw (1987); to be maximized).-K4 – “SF” (~): Score Function (Saitta et al. (2007); to be maximized; see notes). – “CH” (~): Calinski-Harabasz index (Arbelaitz et al. (2013); to be maximized).-k3 – “DB” (?): Davies-Bouldin index (Arbelaitz et al. (2013); to be minimized).k4 – “DBstar” (?): Modified Davies-Bouldin index (DB*) (Kim and Ramakrishna (2005); to be minimized). -k4 – “D” (!): Dunn index (Arbelaitz et al. (2013); to be maximized). k5 – “COP” (!): COP index (Arbelaitz et al. (2013); to be minimized). k9
# different seeds index score result
df_cvi2 <- list()
for (i in 1:100){
df_clust2 <- tsclust(df_lst, type = "partitional", k = 4, distance = "dtw_basic", centroid = "pam", seed=i)
df_metric2 <- cvi(df_clust2, type = "valid", log.base = 10)
df_cvi2 <- append(df_cvi2, list(df_metric2))
}
df_cvi_ma2 <- do.call(rbind, df_cvi2)
rw2 <- as.character(seq(1, 100))
rownames(df_cvi_ma2) <- rw2
print(df_cvi_ma2)
for (i in 2:10){df_clust_opt <- tsclust(df_lst, type = "partitional", k = i, distance = "dtw", centroid = "pam",seed = 700)
plot(df_clust_opt)}
#k4
df_clust_opt_final <- tsclust(df_lst, type = "partitional", k = 4, distance = "dtw", centroid = "pam",seed = 700)
plot(df_clust_opt_final)
for (i in 1:10){df_clust_opt_final <- tsclust(df_lst, type = "partitional", k = 4, distance = "dtw", centroid = "pam",seed = i)
plot(df_clust_opt_final)}
for (i in 11:20){df_clust_opt_final <- tsclust(df_lst, type = "partitional", k = 4, distance = "dtw", centroid = "pam",seed = i)
plot(df_clust_opt_final)}
# Extract cluster assignments
cluster_assignments <- df_clust_opt_final@cluster
# Determine the number of clusters
num_clusters <- max(cluster_assignments)
# Loop through each cluster and print the jurisdictions in it
for (cluster_number in 1:num_clusters) {
cat("Jurisdictions in Cluster", cluster_number, ":\n")
# Find the indices of jurisdictions in this cluster
indices_in_cluster <- which(cluster_assignments == cluster_number)
# Print the jurisdictions corresponding to these indices
print(jurisdiction[indices_in_cluster])
cat("\n") # Add a newline for readability
}
Jurisdictions in Cluster 1 :
[1] "California" "Colorado" "Delaware" "Indiana" "Iowa" "Maryland" "Massachusetts" "Michigan"
[9] "Missouri" "Nevada" "New Mexico" "Ohio" "Oklahoma" "Pennsylvania" "South Dakota" "Tennessee"
[17] "Texas" "Utah" "Vermont" "West Virginia" "Wyoming"
Jurisdictions in Cluster 2 :
[1] "Connecticut" "Illinois" "Kentucky" "New York"
Jurisdictions in Cluster 3 :
[1] "Alaska" "Arkansas" "Florida" "Georgia" "Idaho" "Kansas" "Maine" "Mississippi"
[9] "Montana" "National" "Nebraska" "New Hampshire" "New Jersey" "North Dakota" "South Carolina" "Virginia"
[17] "Wisconsin"
Jurisdictions in Cluster 4 :
[1] "Alabama" "Arizona" "Hawaii" "Louisiana" "Minnesota" "North Carolina" "Oregon" "Rhode Island"
[9] "Washington"
# Load necessary libraries
library(ggplot2)
library(reshape2)
# Loop through each cluster
for (cluster_number in 1:num_clusters) {
cat("Plotting jurisdictions in Cluster", cluster_number, ":\n")
# Find the indices of jurisdictions in this cluster
indices_in_cluster <- which(cluster_assignments == cluster_number)
# Get the names of the jurisdictions in this cluster
jurisdictions_in_cluster <- jurisdiction[indices_in_cluster]
# Filter the gap_list_year data frame for these jurisdictions
cluster_data <- df[df$Jurisdiction %in% jurisdictions_in_cluster, ]
# Convert the data to long format for ggplot
long_df <- melt(cluster_data, id.vars = "Jurisdiction", variable.name = "Year", value.name = "Value")
# Plot
p <- ggplot(long_df, aes(x = Year, y = Value, group = Jurisdiction, color = Jurisdiction)) +
geom_line() +
labs(title = paste("Cluster", cluster_number), x = "Year", y = "Gap") +
theme(legend.position = "right")
print(p)
ggsave(paste("cluster_", cluster_number, ".png", sep=""), plot = p)
}
Plotting jurisdictions in Cluster 1 :
Plotting jurisdictions in Cluster 2 :
Plotting jurisdictions in Cluster 3 :
Plotting jurisdictions in Cluster 4 :